/**********************************************************************/
/*
      Title: fake_kab_randomization_iterate.do
			Author: Robbie Dulin, Clotaire Boyer
			Created: 10/2/2019
    	Description: Creates 1000 resamples of the treatment assignments for
        randomization inference.
      Output: resample1000.dta
*/
/**********************************************************************/

/*----------------------------------------------------*/
                  /* Set Strata */
/*----------------------------------------------------*/
*** Create the holdout sample (and thus the strata) using the original seed

version 14.2

cap log close
local prefix: display %tdCYND td(`c(current_date)')
log using "$log/`prefix'_fake_kab_randomization_iterate", replace

local treatedprobability = 0.3
local target = 8300000 /*target # beneficiaries*/
local numleaveout = 20 /*how many to leaveout to hit 8.3 million?*/
local bottomX = 60 /*what percentile to draw the leaveouts from */

** NOTE: I use original seed to replicate original strata assignments
// I change this seed below to randomize treatment assignments within these strata
set seed 18271234
set sortseed 18271234
set varabbrev on
set more off

clear

insheet using "$importable/kab_priority1_2_import.csv"

* clean up data
drop if no == .
count
assert r(N) == 180

replace jumlahkpm = subinstr(jumlahkpm,",","",.)
destring jumlahkpm, replace


* drop pilot locations
g pilot = 1 if namakabupaten == "GOWA"
replace pilot = 1 if namakabupaten == "DELI SERDANG"
replace pilot = 1 if namakabupaten == "BOYOLALI"
replace pilot = 1 if namakabupaten == "TABANAN"
replace pilot = 1 if namakabupaten == "LOMBOK TIMUR"
replace pilot = 1 if namakabupaten == "KEDIRI"
replace pilot = 1 if namakabupaten == "KULON PROGO"
replace pilot = 1 if namakabupaten == "SIDOARJO"
replace pilot = 1 if namakabupaten == "JEMBER"

* treat all java that are in top 120 in priority list
// replace pilot = 1 if (namaprov == "BANTEN" | namaprov == "JAWA BARAT" | namaprov == "D I YOGYAKARTA"  | namaprov == "JAWA TENGAH" | namaprov == "JAWA TIMUR") & no <= 80
replace pilot = 1 if (namaprov == "BANTEN" | namaprov == "JAWA BARAT" | namaprov == "D I YOGYAKARTA"  | namaprov == "JAWA TENGAH" | namaprov == "JAWA TIMUR") & no <= 80

egen temptotalkpmpilot = total(jumlahkpm) if pilot == 1
summ temptotalkpmpilot
local target_nonpilot = `target' - r(mean)
display `target_nonpilot' /*this is the number of beneficiaries we need to get to end up at `target' at the end*/

/* NOTE: treatments.xlsx file used in "170829 Phase in Randomization.do" is not produced by "170808 Kab Randomization.do" (instead it produces treatments.csv). An earlier version
		of "170808 Kab Randomization.do" must have produced it. The only difference is that treatments.xlsx includes variable pilotjava, which appears to be equal to 1 when pilot
		= 1 and javatreatforsure = 1. This produces 9 pilot districts. To make sure fake_treatments also has 9 districts, I create a separate pilotjava variable and replace pilot
		districts. */

* make sure there are 9 pilot districts
gen pilotjava = 1 if (namaprov == "BANTEN" | namaprov == "JAWA BARAT" | namaprov == "D I YOGYAKARTA"  | namaprov == "JAWA TENGAH" | namaprov == "JAWA TIMUR") & no <= 80
replace pilot = .
replace pilot = 1 if namakabupaten == "GOWA"
replace pilot = 1 if namakabupaten == "DELI SERDANG"
replace pilot = 1 if namakabupaten == "BOYOLALI"
replace pilot = 1 if namakabupaten == "TABANAN"
replace pilot = 1 if namakabupaten == "LOMBOK TIMUR"
replace pilot = 1 if namakabupaten == "KEDIRI"
replace pilot = 1 if namakabupaten == "KULON PROGO"
replace pilot = 1 if namakabupaten == "SIDOARJO"
replace pilot = 1 if namakabupaten == "JEMBER"

count if pilot == 1
assert `r(N)' == 9

tempfile temp_all
save `temp_all',replace

* don't randomize for 9 pilots and for Java
drop if pilot == 1 | pilotjava == 1
count
*assert r(N) == 91**

** Create holdout sample
*** Find the cutoff value
_pctile jumlahkpm, p(`bottomX')
return list

*** pick holdout sample
g holdoutsampleeligible = 1 if jumlahkpm <= r(r1)

sort jumlahkpm
g temprandom = uniform() if holdoutsampleeligible  == 1
egen rankrand1 = rank(temprandom) if holdoutsampleeligible  == 1
g holdoutsample = 1 if rankrand1 <= `numleaveout'
replace holdoutsample = 0 if holdoutsample != 1

preserve
keep if holdoutsample == 1
tempfile holdoutsamp
save `holdoutsamp'
restore
keep if holdoutsample == 0

*** Stratification
g provgroup = namaprov

replace provgroup = "SUMATERA_GROUP" if namaprov == "BENGKULU" | namaprov == "JAMBI" | namaprov == "LAMPUNG" | namaprov == "RIAU" | namaprov == "SUMATERA BARAT" | namaprov == "SUMATERA SELATAN" | namaprov == "KEP. BANGKA BELITUNG"
replace provgroup = "JABAR_BANTEN_JATENG_DYI" if namaprov == "BANTEN" | namaprov == "JAWA BARAT" | namaprov == "D I YOGYAKARTA"  | namaprov == "JAWA TENGAH"
replace provgroup = "KALIMANTAN" if namaprov == "KALIMANTAN BARAT" | namaprov  == "KALIMANTAN TENGAH" | namaprov  == "KALIMANTAN SELATAN" | namaprov == "KALIMANTAN TIMUR"
replace provgroup = "SULAWESI_NONSEL" if namaprov == "SULAWESI UTARA" | namaprov == "GORONTALO" | namaprov == "SULAWESI TENGGARA" | namaprov == "SULAWESI TENGAH"


egen stratum1 = group(provgroup)



** for big strata make smaller strata by indeksgab
g stratum2 = 1
sort stratum1 jumlahkpm
by stratum1: g rankwithinprov = _n
by stratum1: g numwithinprov = _N
g ptilewithinprov = (rankwithinprov-1) / numwithinprov

*subdivide large provinces into groups of at least 5
g numsubstratum = max(floor(numwithinprov/5),1)

replace stratum2 = floor(ptilewithinprov*numsubstratum)+1  if holdoutsample != 1

tab stratum1 stratum2

egen stratum = group(stratum1 stratum2)


/*----------------------------------------------------*/
                  /* Iterations */
/*----------------------------------------------------*/
/*
Take 1000 random draws of the treatment assignments using the same strata.
*/

// Set the seed again
// Original seed used above to replicate stratum assignments
// now different seed used to randomize within these strata



set seed 71263791
set sortseed 71263791
// set obs 1
// gen id = 1

qui forvalues n = 1 / 1000 {

  noi di "Iteration Number `n'"
  preserve

  ** Randomize within stratum for nonholdout sample
  bys stratum: egen numwithinstratum = count(stratum)

  ** Determine how many treated within each stratum
  g numtotreat = numwithinstratum * `treatedprobability'

  ** round up or round down based on remainder probability, so that on average everyone has probability of being treated of `treatedprobability'
  g remainder = numtotreat - floor(numtotreat)

  * order stratum from 1 to N based on remainder
  by stratum: g a = _n
  egen rankstrat = rank(remainder) if a == 1, unique
  qui tab stratum
  local numstrat = r(r)
  replace rankstrat = `numstrat' - rankstrat + 1

  sort no
  g temprand1 = uniform()
  g iteratedremainder = remainder if a == 1

  ** iterate through stratum from most highly probable to least assigning remainders
  foreach strat_i of num 1/`numstrat' {
  	g temptreated = 1 if temprand1 <= iteratedremainder & a == 1 & `strat_i' == rankstrat & iteratedremainder != .
  	replace temptreated = 0 if temptreated == .


  	replace numtotreat = floor(numtotreat) +  temptreated  if a == 1 & `strat_i' == rankstrat


  	* replace the rest of the remainders
  	qui sum iteratedremainder if a == 1 & `strat_i' == rankstrat
  	local realizedprob = r(mean)


  	egen tempoldsum = total(iteratedremainder)
  	replace iteratedremainder = 0 if a == 1 & `strat_i' == rankstrat
  	egen tempnewsum = total(iteratedremainder)

  	egen tempalltreated = max(temptreated)


  	replace iteratedremainder = iteratedremainder * tempoldsum/tempnewsum if tempalltreated == 0
  	replace iteratedremainder = iteratedremainder * (tempoldsum-1)/tempnewsum if tempalltreated == 1

  	egen tempnewit = total(iteratedremainder)
  	drop temptreated tempalltreated tempoldsum tempnewsum tempnewit
  }

  ** numtotreat is now correctif a == 1; make it correct everywhere
  g tempnumtotreat = numtotreat if a == 1
  bys stratum: egen tempnumtotreat_assign = mean(tempnumtotreat)
  assert tempnumtotreat_assign == numtotreat if a == 1
  replace numtotreat = tempnumtotreat_assign


  g percenttreated = numtotreat / numwithinstratum

  ***
  * Randomize
  ***

  sort stratum no
  g temp3 = uniform()
  sort stratum temp3
  by stratum: g treated = 1 if _n <= numtotreat
  replace treated = 0 if treated == .

//   tab stratum treated


  ***
  * Count total treated thus far
  ***

  egen totaltreatedmain = total(jumlahkpm) if treated == 1 | pilot == 1
  summ totaltreatedmain
  local tottreatedmain = r(mean)


  **
  * treat holdout sample
  **

  append using `holdoutsamp'
  local holdoutgoal = `target_nonpilot' - `tottreatedmain'
  display `holdoutgoal'


  sort holdoutsample jumlahkpm
  g temprand2 = uniform() if holdoutsample == 1
  egen rankholdoutsample = rank(temprand2) if holdoutsample == 1

  *generate running sum of jumlahkpm in holdoutsample
  sort rankholdoutsample
  g runningsum = sum(jumlahkpm) if holdoutsample ==1
  g treatholdout = 1 if runningsum < `holdoutgoal'
  replace treatholdout = 0 if runningsum >= `holdoutgoal'

  * force at least 1 treated and at least 1 nontreated
  replace treatholdout = 1 if rankholdoutsample == 1 | rankholdoutsample == 2
  replace treatholdout = 0 if rankholdoutsample == `numleaveout' | rankholdoutsample == `numleaveout' - 1


  display `holdoutgoal'
//   list holdoutsample rankhold jumlahkpm running treatholdout if holdoutsample == 1
  g finalstratum = stratum
  replace finalstratum = 1000 if holdoutsample == 1
  g treated_nonholdout = treated
  replace treated = 1 if treatholdout == 1 & holdoutsample == 1
  replace treated = 0 if treatholdout == 0 & holdoutsample == 1


  ****
  * merge back in pilot
  ***
  keep namaprovinsi namakabupaten treated finalstratum treated_nonholdout holdoutsample provgroup
  sort namaprovinsi namakabupaten
  merge 1:1 namaprovinsi namakabupaten using `temp_all'
//   tab _merge
  drop _merge

  g javatreatforsure = 1 if (namaprov == "BANTEN" | namaprov == "JAWA BARAT" | namaprov == "D I YOGYAKARTA"  | namaprov == "JAWA TENGAH" | namaprov == "JAWA TIMUR") & no <= 80
  // replace pilot = . if javatreatforsure == 1
  /* NOTE: treatments.xlsx file used in "170829 Phase in Randomization.do" is not produced by "170808 Kab Randomization.do" (instead it produces treatments.csv). An earlier version
  		of "170808 Kab Randomization.do" must have produced it. The only difference is that treatments.xlsx includes variable pilotjava, which appears to be equal to 1 when pilot
  		= 1 and javatreatforsure = 1. This produces 9 pilot districts. To make sure fake_treatments also has 9 districts, I comment out the above line. */
  sort namaprovinsi namakabupaten
//   list namaprovinsi namakabupaten treated pilot javatreatforsure if treated == 1 | pilot == 1 | javatreatforsure == 1

  count if treated == 1 | pilot == 1 | javatreatforsure == 1


  // replace treated = . if javatreatforsure == 1
  // NOTE: This line added to be consistent with treatments.xlsx

  count if pilot == 1
  assert `r(N)' == 9

  count if treated == 1
  count if treated == 0
//   list namakabupaten if treated == 1
//   list namakabupaten if treated == 0
  // check that there are 9 pilot districts to be consistent with treatments.xlsx
  *** Outsheet treatments

  tempfile treatments
  save `treatments', replace

  keep if treated == 1 | pilot == 1 | javatreatforsure == 1
  sort namaprovinsi namakabupaten
  keep namaprovinsi namakabupaten treated pilot javatreatforsure jumlahkpm finalstratum
  tempfile t
  save `t', replace

  *** Outsheet controls
  u `treatments', clear
  drop if treated == 1 | pilot == 1 | javatreatforsure == 1
  sort namaprovinsi namakabupaten
  keep namaprovinsi namakabupaten treated pilot javatreatforsure jumlahkpm finalstratum
  tempfile c
  save `c', replace


  /*----------------------------------------------------*/
                	/* Randomize Phases */
  /*----------------------------------------------------*/

  // 39 T minus 9 pilot = randomize 30 T
  local T_phase1_n = 30
  local T_phase2_n = 69
  local T_phase3_n = 117

  local C_phase1_n = 21
  local C_phase2_n = 42
  local C_phase3_n = 63


  * randomize treatment based on number of kab
  	* randomize phase 1, based on number of kab
  	use `t', clear
  	gen phase1 = 1 if pilot == 1

  	gen rand = runiform() if phase1 == .
  	sort rand
  	gen n = _n if rand != .
  	di `T_phase1_n'
  	replace phase1 = 1 if n <= `T_phase1_n'

  	tempfile rand1
  	save `rand1', replace

  	keep if phase1 == 1
  	tempfile phase1
  	save `phase1', replace

  	* randomize phase 2, based on number of kab
  	use `rand1', clear
  	keep if phase1 == .
  	drop phase1

  	di `T_phase2_n'
  	gen phase2 = 1 if n <= `T_phase2_n'

  	tempfile rand2
  	save `rand2', replace

  	keep if phase2 == 1
  	tempfile phase2
  	save `phase2', replace

  	* phase 3 is the remainders
  	use `rand2', clear
  	keep if phase2 == .
  	drop phase2
  	gen phase3 = 1

  	keep if phase3 == 1
  	tempfile phase3
  	save `phase3', replace

  	* combine data
  	use `phase1', clear
  	append using `phase2'
  	append using `phase3'
  	drop rand n
//   	tab1 phase?
  	sort namaprovinsi namakabupaten
    tempfile phases_T_n
    save `phases_T_n', replace

  * randomize control based on number of kab
  	* randomize phase 1, based on number of kab
  	use `c', clear
  	gen rand = runiform()
  	sort rand
  	gen n = _n
  	gen phase1 = 1 if n <= `C_phase1_n'

  	tempfile rand1
  	save `rand1', replace

  	keep if phase1 == 1
  	tempfile phase1
  	save `phase1', replace

  	* randomize phase 2, based on number of kab
  	use `rand1', clear
  	keep if phase1 == .
  	drop phase1

  	di `C_phase2_n'
  	gen phase2 = 1 if n <= `C_phase2_n'

  	tempfile rand2
  	save `rand2', replace

  	keep if phase2 == 1
  	tempfile phase2
  	save `phase2', replace

  	* phase 3 is the remainders
  	use `rand2', clear
  	keep if phase2 == .
  	drop phase2
  	gen phase3 = 1

  	keep if phase3 == 1
  	tempfile phase3
  	save `phase3', replace

  	* combine data
  	use `phase1', clear
  	append using `phase2'
  	append using `phase3'
  	drop rand n
//   	tab1 phase?
  	sort namaprovinsi namakabupaten

    * Append treatment phases
    append using `phases_T_n'
    keep if !missing(treated)
    assert _N == 105    // make sure there are 105 experimental districts

    * Save assignments
    gen treated_sep18 = treated == 1 & phase1 == 1
    keep namakabupaten finalstratum treated treated_sep18
	  rename finalstratum finalstratum`n'
	  rename treated_sep18 treated_sep18`n'
    rename treated treated`n'
    tempfile iterate`n'
    save `iterate`n'', replace

  restore
  }

u `iterate1', clear
rename finalstratum1 finalstratum
qui forvalues n = 2 / 1000 {
  local num = `n' - 1
  noi di "Merge number `num'"
  merge 1:1 namakabupaten using `iterate`n''
  assert _m == 3
  drop _m
  assert finalstratum`n' == finalstratum
  drop finalstratum`n'
}

drop finalstratum

// rename PONTIANAK to MEMPAWAH
replace namakabupaten = "MEMPAWAH" if namakabupaten == "PONTIANAK"

preserve
keep namakabupaten treated_sep18*
save "$importable/resample1000_phase.dta", replace
restore

drop treated_sep18*
save "$importable/resample1000.dta", replace

// check that ratio of treated/control is reasonable
local sum = 0
foreach var of varlist treated* {
  qui summ `var'
  local sum = `sum' + `r(mean)'
}

di `sum'/1000
di 42/105

cap log close
